Abstract

I describe the effects that certain factors play over

Introduction

This data set was provided by Prosper

#Drop loans whose employment status is uninformative, then keep only the rows
#whose stated monthly income lies strictly between the 1st and 99th
#percentiles (the percentiles are computed on the already-filtered rows)
loans2 <- loans %>%
  subset(!(EmploymentStatus %in% c('', 'Not available', 'Other'))) %>%
  subset(StatedMonthlyIncome > quantile(StatedMonthlyIncome, 0.01) &
           StatedMonthlyIncome < quantile(StatedMonthlyIncome, 0.99))

#Mean and median original loan amount, one row per income range
loan.loan_by_income <- loans2 %>%
  group_by(IncomeRange) %>%
  summarise(mean_loan_amount = mean(LoanOriginalAmount),
            median_loan_amount = median(LoanOriginalAmount))

#Same two statistics, one row per loan term
loan.loan_by_term <- loans2 %>%
  group_by(Term) %>%
  summarise(mean_loan_amount = mean(LoanOriginalAmount),
            median_loan_amount = median(LoanOriginalAmount))

#Reshape both summaries to long format (grouping column, variable, value) so
#that the statistic can be mapped to the fill aesthetic when plotting.
#The argument is spelled out as id.vars instead of relying on R's partial
#matching of the abbreviated name `id`.
mdata <- melt(loan.loan_by_income, id.vars = 'IncomeRange')
mdata2 <- melt(loan.loan_by_term, id.vars = 'Term')

#Number of loans per employment status, each bar labelled with its count
ggplot(loans2, aes(EmploymentStatus)) +
  geom_bar() +
  #after_stat(count) replaces the deprecated ..count.. notation; vjust lifts
  #the label just above the top of its bar
  geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.5)

#Distribution of original loan amounts
#The bin count is stated explicitly (30 is what stat_bin() was silently
#defaulting to), so the figure is unchanged but no longer depends on, or
#reports, an implicit default
ggplot(loans2, aes(LoanOriginalAmount)) +
  geom_histogram(bins = 30)

#Distribution of original loan amounts, restricted to defaulted loans
#The bin count is stated explicitly (30 is what stat_bin() was silently
#defaulting to), so the figure is unchanged but no longer depends on, or
#reports, an implicit default
ggplot(subset(loans2, LoanStatus == 'Defaulted'), aes(LoanOriginalAmount)) +
  geom_histogram(bins = 30)

#Number of defaulted loans per income range
#IncomeRange is categorical, so the bars are counted with geom_bar();
#geom_histogram(stat = 'count') drew the same bars but warned that its binning
#parameters were being ignored.
#The limits fix the left-to-right order of the ranges; rows whose income range
#is not listed are left out of the plot.
ggplot(subset(loans2, LoanStatus == 'Defaulted'), aes(IncomeRange)) +
  geom_bar() +
  scale_x_discrete(limits = c('Not employed', '$1-24,999', '$25,000-49,999',
                              '$50,000-74,999', '$75,000-99,999', '$100,000+'))
## Warning: Removed 21 rows containing non-finite values (stat_count).

#Mean vs. median loan amount for every income range, drawn as side-by-side
#bars; the limits give the ranges an ascending left-to-right order
ggplot(data = mdata,
       mapping = aes(x = IncomeRange, y = value, fill = variable)) +
  scale_x_discrete(limits = c('Not employed', '$1-24,999', '$25,000-49,999',
                              '$50,000-74,999', '$75,000-99,999',
                              '$100,000+')) +
  geom_bar(stat = 'identity', position = 'dodge')
## Warning: Removed 4 rows containing missing values (geom_bar).

#Mean vs. median loan amount for every loan term, drawn as side-by-side bars
ggplot(data = mdata2,
       mapping = aes(x = Term, y = value, fill = variable)) +
  geom_bar(stat = 'identity', position = 'dodge')

#Number of loans in each income range
ggplot(data = loans2, mapping = aes(x = IncomeRange)) +
  geom_bar()

#Original loan amount against stated monthly income
#Points are jittered and almost transparent so overlapping loans stay readable
ggplot(data = loans2,
       mapping = aes(x = StatedMonthlyIncome, y = LoanOriginalAmount)) +
  geom_point(position = 'jitter', alpha = 0.05)

#Original loan amount against stated monthly income, with each (jittered)
#point coloured by the borrower's income range
ggplot(data = loans2,
       mapping = aes(x = StatedMonthlyIncome, y = LoanOriginalAmount,
                     colour = IncomeRange)) +
  geom_point(position = 'jitter')

#Original loan amount against stated monthly income, one panel per loan term
ggplot(data = loans2,
       mapping = aes(x = StatedMonthlyIncome, y = LoanOriginalAmount)) +
  geom_point(position = 'jitter', alpha = 0.3) +
  facet_wrap(~ Term)

#Original loan amount against the lower bound of the credit score range
#A red trend line is overlaid; ggplot2 picks the smoothing method itself
ggplot(data = loans2,
       mapping = aes(x = CreditScoreRangeLower, y = LoanOriginalAmount)) +
  geom_point(alpha = 0.10) +
  geom_smooth(colour = 'red', method = 'auto')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#Borrower rate against the lower bound of the credit score range
#A red trend line is overlaid; ggplot2 picks the smoothing method itself
ggplot(data = loans2,
       mapping = aes(x = CreditScoreRangeLower, y = BorrowerRate)) +
  geom_point(alpha = 0.10) +
  geom_smooth(colour = 'red', method = 'auto')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#Borrower APR against the lower bound of the credit score range
#A red trend line is overlaid; ggplot2 picks the smoothing method itself
ggplot(data = loans2,
       mapping = aes(x = CreditScoreRangeLower, y = BorrowerAPR)) +
  geom_point(alpha = 0.10) +
  geom_smooth(colour = 'red', method = 'auto')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#Lender yield (y) against borrower APR (x); low alpha keeps dense areas legible
ggplot(data = loans2, mapping = aes(x = BorrowerAPR, y = LenderYield)) +
  geom_point(alpha = 0.10)

#Estimated loss (y) against borrower APR (x)
#Rows with a missing value cannot be drawn; ggplot2 drops them with a warning
ggplot(data = loans2, mapping = aes(x = BorrowerAPR, y = EstimatedLoss)) +
  geom_point(alpha = 0.10)
## Warning: Removed 20665 rows containing missing values (geom_point).

#Estimated loss (y) against lender yield (x)
#Rows with a missing value cannot be drawn; ggplot2 drops them with a warning
ggplot(data = loans2, mapping = aes(x = LenderYield, y = EstimatedLoss)) +
  geom_point(alpha = 0.10)
## Warning: Removed 20665 rows containing missing values (geom_point).